# Point reticulate at the Python installation that has TensorFlow/Keras.
reticulate::use_python("/anaconda3/bin/python")

library(readr)    # read_csv(), read_rds()
library(ggplot2)  # qplot() and friends
library(dplyr)
library(methods)
library(keras)    # neural networks via reticulate/TensorFlow
theme_set(theme_minimal())  # default ggplot2 theme for every plot below

MNIST, once again

You’ll work with a different dataset for the lab, but let’s use MNIST once more for today’s notes:

# MNIST metadata: one row per image with id, train/test split, and label.
mnist <- read_csv("https://statsmaths.github.io/ml_data/mnist_10.csv")
## Parsed with column specification:
## cols(
##   obs_id = col_character(),
##   train_id = col_character(),
##   class = col_double(),
##   class_name = col_double()
## )
# Pixel data; indexed as x28[image, row, col] — assumes 28x28 grayscale
# images, one slice per row of `mnist` (TODO confirm against the file).
x28 <- read_rds("image_data/mnist_10_x28.rds")

# Flatten each 28x28 image into one length-784 row of X.
X <- t(apply(x28, 1, cbind))
y <- mnist$class

# Keep only training rows; one-hot encode the digit labels for keras.
X_train <- X[mnist$train_id == "train",]
y_train <- to_categorical(y[mnist$train_id == "train"], num_classes = 10)

Again, the task is to classify the image as belonging to one of 10 digits (0-9).

Neural Network Regularization

With MNIST, a small neural network goes a long way. The real power of neural networks, though, begins to come into play when working with larger and deeper networks. We have already seen that it is easy to build these with the keras package. For example, here is a model with four hidden layers:

# Dense feed-forward network: four hidden layers of 512 relu units and a
# 10-unit softmax output (one unit per digit class).
model <- keras_model_sequential()
model %>%

  # Hidden layer 1; input is the flattened 28x28 = 784 pixel vector.
  layer_dense(units = 512, kernel_initializer = "glorot_normal",
              input_shape = c(28^2)) %>%
  layer_activation(activation = "relu") %>%

  layer_dense(units = 512, kernel_initializer = "glorot_normal") %>%
  layer_activation(activation = "relu") %>%

  layer_dense(units = 512, kernel_initializer = "glorot_normal") %>%
  layer_activation(activation = "relu") %>%

  layer_dense(units = 512, kernel_initializer = "glorot_normal") %>%
  layer_activation(activation = "relu") %>%

  # Output layer: softmax probabilities over the 10 digits.
  layer_dense(units = 10) %>%
  layer_activation(activation = "softmax")

# Cross-entropy loss with plain stochastic gradient descent.
# NOTE(review): `lr` is the argument name in this keras version; newer
# releases rename it `learning_rate`.
model %>% compile(loss = 'categorical_crossentropy',
                  optimizer = optimizer_sgd(lr = 0.01),
                  metrics = c('accuracy'))

# Print the layer-by-layer summary and parameter counts.
model
## Model
## ___________________________________________________________________________
## Layer (type)                     Output Shape                  Param #     
## ===========================================================================
## dense_1 (Dense)                  (None, 512)                   401920      
## ___________________________________________________________________________
## activation_1 (Activation)        (None, 512)                   0           
## ___________________________________________________________________________
## dense_2 (Dense)                  (None, 512)                   262656      
## ___________________________________________________________________________
## activation_2 (Activation)        (None, 512)                   0           
## ___________________________________________________________________________
## dense_3 (Dense)                  (None, 512)                   262656      
## ___________________________________________________________________________
## activation_3 (Activation)        (None, 512)                   0           
## ___________________________________________________________________________
## dense_4 (Dense)                  (None, 512)                   262656      
## ___________________________________________________________________________
## activation_4 (Activation)        (None, 512)                   0           
## ___________________________________________________________________________
## dense_5 (Dense)                  (None, 10)                    5130        
## ___________________________________________________________________________
## activation_5 (Activation)        (None, 10)                    0           
## ===========================================================================
## Total params: 1,195,018
## Trainable params: 1,195,018
## Non-trainable params: 0
## ___________________________________________________________________________

The new model now has over 1 million parameters. However, if we have too many parameters, the model will begin to overfit.

# Train for 16 passes over the data, holding out 10% for validation.
history <- model %>%
  fit(X_train, y_train, epochs = 16,
      validation_split = 0.1, batch_size = 32)
plot(history)  # loss/accuracy curves by epoch

We need to regularize the model much like we did with the elastic net. One way to regularize a neural network is to include a dropout layer. This layer, during training only, randomly converts a proportion of its inputs to zero. This forces the next layer to smooth out weights over all of the inputs in the prior layer. It is similar to when random forests only allow a random subset of the variables to be used at a given branching point, or how the weights in a ridge regression force the model to spread out over correlated values.

Dropout can be added to the model with the layer_dropout function. Here, we also change the initializing function for the starting weights in the neural network. I find that “glorot_normal” often performs better when using dropout.

# Same architecture, but with a dropout layer (rate = 0.5) after each
# hidden layer: during training half the activations are randomly zeroed,
# which regularizes the network.
model <- keras_model_sequential()
model %>%

  layer_dense(units = 512, kernel_initializer = "glorot_normal",
              input_shape = c(28^2)) %>%
  layer_activation(activation = "relu") %>%
  layer_dropout(rate = 0.5) %>%

  layer_dense(units = 512, kernel_initializer = "glorot_normal") %>%
  layer_activation(activation = "relu") %>%
  layer_dropout(rate = 0.5) %>%

  layer_dense(units = 512, kernel_initializer = "glorot_normal") %>%
  layer_activation(activation = "relu") %>%
  layer_dropout(rate = 0.5) %>%

  layer_dense(units = 512, kernel_initializer = "glorot_normal") %>%
  layer_activation(activation = "relu") %>%
  layer_dropout(rate = 0.5) %>%

  # Output layer: no dropout after the softmax.
  layer_dense(units = 10) %>%
  layer_activation(activation = "softmax")

model %>% compile(loss = 'categorical_crossentropy',
                  optimizer = optimizer_sgd(lr = 0.01),
                  metrics = c('accuracy'))

# Dropout layers add no parameters, so the total count is unchanged.
model
## Model
## ___________________________________________________________________________
## Layer (type)                     Output Shape                  Param #     
## ===========================================================================
## dense_6 (Dense)                  (None, 512)                   401920      
## ___________________________________________________________________________
## activation_6 (Activation)        (None, 512)                   0           
## ___________________________________________________________________________
## dropout_1 (Dropout)              (None, 512)                   0           
## ___________________________________________________________________________
## dense_7 (Dense)                  (None, 512)                   262656      
## ___________________________________________________________________________
## activation_7 (Activation)        (None, 512)                   0           
## ___________________________________________________________________________
## dropout_2 (Dropout)              (None, 512)                   0           
## ___________________________________________________________________________
## dense_8 (Dense)                  (None, 512)                   262656      
## ___________________________________________________________________________
## activation_8 (Activation)        (None, 512)                   0           
## ___________________________________________________________________________
## dropout_3 (Dropout)              (None, 512)                   0           
## ___________________________________________________________________________
## dense_9 (Dense)                  (None, 512)                   262656      
## ___________________________________________________________________________
## activation_9 (Activation)        (None, 512)                   0           
## ___________________________________________________________________________
## dropout_4 (Dropout)              (None, 512)                   0           
## ___________________________________________________________________________
## dense_10 (Dense)                 (None, 10)                    5130        
## ___________________________________________________________________________
## activation_10 (Activation)       (None, 10)                    0           
## ===========================================================================
## Total params: 1,195,018
## Trainable params: 1,195,018
## Non-trainable params: 0
## ___________________________________________________________________________

We can train it just as with the simpler neural networks.

# Train the regularized network; dropout is active during training only.
history <- model %>%
  fit(X_train, y_train, epochs = 20,
      validation_split = 0.1, batch_size = 32)
plot(history)

Why is the training accuracy lower than the validation accuracy, particularly for the first few epochs? The reason is that the training accuracy uses the dropout functions but the validation accuracy turns the dropout off.

Neural network training algorithm

From the plot, it looks like the model is not improving much after about 10 iterations through the dataset. It is possible to train the model with a smaller learning rate to get a slightly better performance. Note that training the model again without redefining the model will start where the previous model left off.

# Recompiling only swaps the optimizer — the learned weights are kept —
# so fitting again resumes from where the previous run stopped, now with
# a smaller learning rate.
model %>% compile(loss = 'categorical_crossentropy',
                  optimizer = optimizer_sgd(lr = 0.003),
                  metrics = c('accuracy'))
history <- model %>%
  fit(X_train, y_train, epochs = 2,
      validation_split = 0.1, batch_size = 32)
plot(history)

Another approach is to modify the SGD algorithm itself. Consider the standard updates in gradient descent:

\[ w_{new} \leftarrow w_{old} - \rho \cdot \nabla_w f \]

A common alternative stores an additional momentum vector:

\[ \Delta w_{new} \leftarrow \mu \cdot \Delta w_{old} - \rho \cdot \nabla_w f \] \[ w_{new} \leftarrow w_{old} + \Delta w_{new} \]

This allows us to approximate the Hessian matrix while only storing an extra p variables (rather than p squared). To implement in R we can do this:

# Rebuild the dropout network from scratch so training starts fresh.
model <- keras_model_sequential()
model %>%

  layer_dense(units = 512, kernel_initializer = "glorot_normal",
              input_shape = c(28^2)) %>%
  layer_activation(activation = "relu") %>%
  layer_dropout(rate = 0.5) %>%

  layer_dense(units = 512, kernel_initializer = "glorot_normal") %>%
  layer_activation(activation = "relu") %>%
  layer_dropout(rate = 0.5) %>%

  layer_dense(units = 512, kernel_initializer = "glorot_normal") %>%
  layer_activation(activation = "relu") %>%
  layer_dropout(rate = 0.5) %>%

  layer_dense(units = 512, kernel_initializer = "glorot_normal") %>%
  layer_activation(activation = "relu") %>%
  layer_dropout(rate = 0.5) %>%

  layer_dense(units = 10) %>%
  layer_activation(activation = "softmax")

# SGD with Nesterov momentum: the update accumulates a momentum vector
# (the Delta-w recursion shown above) on top of the raw gradient step.
model %>% compile(loss = 'categorical_crossentropy',
                  optimizer = optimizer_sgd(lr = 0.01, momentum = 0.9,
                                            nesterov = TRUE),
                  metrics = c('accuracy'))
history <- model %>%
  fit(X_train, y_train, epochs = 2,
      validation_split = 0.1, batch_size = 32)
plot(history)

Finally, can we generalize the approach of changing the learning rate, slowly making the learning rate smaller as we become stuck? Yes! Just use a different optimizer. The RMSprop updates the learning rate as a function of the improvement in each step:

# Fresh copy of the dropout network, this time trained with RMSprop,
# which adapts the learning rate per weight during training.
model <- keras_model_sequential()
model %>%

  layer_dense(units = 512, kernel_initializer = "glorot_normal",
              input_shape = c(28^2)) %>%
  layer_activation(activation = "relu") %>%
  layer_dropout(rate = 0.5) %>%

  layer_dense(units = 512, kernel_initializer = "glorot_normal") %>%
  layer_activation(activation = "relu") %>%
  layer_dropout(rate = 0.5) %>%

  layer_dense(units = 512, kernel_initializer = "glorot_normal") %>%
  layer_activation(activation = "relu") %>%
  layer_dropout(rate = 0.5) %>%

  layer_dense(units = 512, kernel_initializer = "glorot_normal") %>%
  layer_activation(activation = "relu") %>%
  layer_dropout(rate = 0.5) %>%

  layer_dense(units = 10) %>%
  layer_activation(activation = "softmax")

model %>% compile(loss = 'categorical_crossentropy',
                  optimizer = optimizer_rmsprop(lr = 0.001),
                  metrics = c('accuracy'))
history <- model %>%
  fit(X_train, y_train, epochs = 10,
      validation_split = 0.1, batch_size = 32)

I find RMSprop to be OK on dense neural networks, but not as strong as SGD with momentum.

Tuning Neural Networks

The next lab has you take your turn at building a neural network on a dataset very similar to MNIST. There are a number of parameters that you can change in the neural network. Here are the general rules I would follow at this point:

  • run a model for however many epochs (iterations) it takes to visually flatline
  • always build layers by putting together a layer_dense, layer_activation (“relu” on all but the last, which should be “softmax”) and layer_dropout
  • use optimizer_sgd with nesterov = TRUE and momentum equal to 0.9
  • make all hidden layers have the same number of nodes
  • start with 2 hidden layers with 128 nodes each and an lr = 0.01

Then, you can change the following parameters as you test which ones work the best. Make sure that you only change one thing at a time and run enough epochs to get the model to roughly converge

  • experiment with between 2 and 5 hidden layers
  • try doubling or halving the number of hidden nodes; usually work in powers of 2
  • if the model is very noisy, you can decrease the dropout to 0.25
  • try successively halving the learning rate

Try to start small and work up slowly so you do not crash R (or, at least, you don’t crash R too frequently).

Flowers, once again

Let’s look again at the flowers dataset. First, we load the metadata. This is exactly the same as any other dataset in which we pull in a CSV file from GitHub:

# Flower metadata: one row per image with id, split, and class label.
flowers <- read_csv("https://statsmaths.github.io/ml_data/flowers_17.csv")
flowers
## # A tibble: 1,360 x 4
##    obs_id    train_id class class_name
##    <chr>     <chr>    <dbl> <chr>     
##  1 id_000362 valid        4 crocus    
##  2 id_000506 train        6 tigerlily 
##  3 id_000778 valid        9 sunflower 
##  4 id_001233 train       15 windflower
##  5 id_000274 train        3 bluebell  
##  6 id_001218 train       15 windflower
##  7 id_001280 test        NA <NA>      
##  8 id_000895 train       11 colts foot
##  9 id_000851 valid       10 daisy     
## 10 id_000084 train        1 snowdrop  
## # … with 1,350 more rows

Then, we also have to grab the image data itself. To do this, first download the dataset here:

Save it somewhere on your computer and then read it into R:

# Pixel data; indexed below as x64[image, row, col, channel] with
# channels 1-3 = red, green, blue.
x64 <- read_rds("image_data/flowers_17_x64.rds")

I again only want to look at the first 10 types of flowers.

# Keep only classes 0-9. x64 must be filtered BEFORE flowers: the mask
# comes from flowers$class, so flowers has to be full-length still.
x64 <- x64[flowers$class %in% 0:9,,,]
flowers <- flowers[flowers$class %in% 0:9,]
# Class names in class-number order (0..9), fixed as factor levels so
# plots and tables keep this ordering.
fnames <- flowers$class_name[match(0:9, flowers$class)]
fnames <- factor(fnames, levels = fnames)

If we want to improve our model further beyond dense neural networks, we need to include information beyond just the color of the flower. When we look at the images, our brains also use information about shape and texture. Let’s try to find a way to measure this in the image.

I will start by taking a sample flower image and creating a black and white version of it. A simple way to do this is to average the red, green, and blue pixels.

# Grayscale version of one sample image: average the three channels.
i <- 50
bw <- (x64[i,,,1] + x64[i,,,2] + x64[i,,,3]) / 3
# Empty [0,1] x [0,1] plot region, then draw the image into it.
plot(0,0,xlim=c(0,1),ylim=c(0,1),axes= FALSE,type = "n")
rasterImage(bw,0,0,1,1)

To detect texture we can take the brightness of each pixel and subtract it from the brightness of the pixel to its lower right. We can do this in a vectorized fashion as such:

# Diagonal-difference edge detector: |pixel - lower-right neighbour|.
# Dropping the first row/col against the last row/col aligns each pixel
# with its neighbour one down and one right, giving a 63x63 result.
edge <- abs(bw[-1,-1] - bw[-nrow(bw),-ncol(bw)])
plot(0,0,xlim=c(0,1),ylim=c(0,1),axes= FALSE,type = "n")
rasterImage(edge,0,0,1,1)

The resulting image roughly detects edges in the image. Notice that it has only 63-by-63 pixels due to the fact that we cannot compute this measurement on the rightmost or bottommost edges of the plot.

We’ll do this for each image, and save the number of pixels that have an edge value greater than 0.1. You could of course play around with this cutoff, or save a number of different cutoff values. This number will tell us roughly how much of the image consists of edges. A low number indicates a smooth petal and a high one indicates a grassy texture to the flower.

# For every image, the fraction of its 63x63 edge map whose diagonal
# difference exceeds 0.1 — a rough measure of how "edgy" the photo is.
mean_edge <- vapply(seq_len(nrow(flowers)), function(idx) {
  gray <- (x64[idx, , , 1] + x64[idx, , , 2] + x64[idx, , , 3]) / 3
  delta <- abs(gray[-1, -1] - gray[-nrow(gray), -ncol(gray)])
  mean(delta > 0.1)
}, numeric(1))

A boxplot shows that there are differences between the flowers in this measurement. Crocuses in particular have a lot of edges.

# Distribution of edge density by flower type; flipped coordinates make
# the class names readable on the y-axis.
qplot(flowers$class_name, mean_edge, geom = "blank") +
  geom_boxplot() +
  coord_flip() +
  theme_minimal()

Most of the photos have a flower in the middle, but the background may include grass, sky, or other non-related elements. Let’s repeat the edge detector, but now compute the degree of edge-ness only for the middle of the image.

# Same edge statistic, but restricted to the central window (rows and
# columns 20-44 of the edge map), where the flower itself usually sits.
mean_edge_mid <- vapply(seq_len(nrow(flowers)), function(idx) {
  gray <- (x64[idx, , , 1] + x64[idx, , , 2] + x64[idx, , , 3]) / 3
  delta <- abs(gray[-1, -1] - gray[-nrow(gray), -ncol(gray)])
  mean(delta[20:44, 20:44] > 0.1)
}, numeric(1))

This shows a clear differentiation of the flowers. Fritillary have a lot of edges due to their spots in the middle of the photo. Notice that the patterns here are quite different from those in the whole image.

# Edge density in the image centre, again by flower type.
qplot(flowers$class_name, mean_edge_mid, geom = "blank") +
  geom_boxplot() +
  coord_flip() +
  theme_minimal()

We will create a data matrix by putting together the color information with the mean_edge and mean_edge_mid metrics.

# Reference palette: 6 grayscale values (v = 0, 0.2, ..., 1) followed by
# 10 fully saturated hues (h = 0, 0.1, ..., 0.9).
color_vals <- c(hsv(1, 0, seq(0, 1, by = 0.2)),
                hsv(seq(0, 0.9, by = 0.1), 1, 1))

# One row per image: counts of pixels snapped to each palette color.
X_hsv <- matrix(0, ncol = length(color_vals),
                   nrow = nrow(flowers))
for (i in seq_len(nrow(flowers))) {
  red <- as.numeric(x64[i,,,1])
  green <- as.numeric(x64[i,,,2])
  blue <- as.numeric(x64[i,,,3])
  # One row per pixel; columns are hue, saturation, value in [0, 1].
  # FIX: the original named this matrix `hsv`, shadowing the
  # grDevices::hsv() function called below — it only worked because R
  # skips non-function objects when resolving a function call.
  hsv_mat <- t(rgb2hsv(red, green, blue, maxColorValue = 1))

  color <- rep("#000000", nrow(hsv_mat))

  # Low-saturation pixels: snap to the nearest of the 6 gray levels.
  index <- which(hsv_mat[,2] < 0.2)
  color[index] <- hsv(1, 0, round(hsv_mat[index,3] * 5) / 5)

  # Saturated, non-dark pixels: snap the hue to the nearest tenth.
  index <- which(hsv_mat[,2] > 0.2 & hsv_mat[,3] > 0.2)
  color[index] <- hsv(round(hsv_mat[index,1],1), 1, 1)

  # Tabulate against the full palette so every row has the same columns.
  X_hsv[i,] <- table(factor(color, levels = color_vals))
}
# Final feature matrix: color histogram plus the two edge statistics.
X_edge <- cbind(X_hsv, mean_edge, mean_edge_mid)
y <- flowers$class

X_train <- X_edge[flowers$train_id == "train",]
X_valid <- X_edge[flowers$train_id == "valid",]
y_train <- y[flowers$train_id == "train"]
y_valid <- y[flowers$train_id == "valid"]

library(glmnet)
# Cross-validated elastic net over the 10 flower classes; alpha = 0.2
# puts most of the penalty weight on the ridge term.
model <- cv.glmnet(X_train, y_train, family = "multinomial",
                   alpha = 0.2)
plot(model)  # CV deviance as a function of log(lambda)

I’ve included the cross-validation curve because it is a perfect textbook example of what the curve should look like (but rarely does so nicely). The resulting model performs better than the color alone.

# Predicted class for every image (all splits at once); predict() with
# type = "class" returns characters, so coerce back to numeric 0-9.
pred <- as.numeric(predict(model, newx = X_edge,
                           type = "class"))
# Classification accuracy within each of the train/valid splits.
tapply(pred == y, flowers$train_id, mean)
##  train  valid 
## 0.6225 0.4500

A confusion matrix shows us that only a few flowers are still difficult to differentiate.

# Confusion matrix on the validation set; pred + 1 maps the 0-9 class
# codes to 1-based positions in fnames.
table(pred = fnames[pred[flowers$train_id == "valid"] + 1],
      y = y[flowers$train_id == "valid"])
##              y
## pred           0  1  2  3  4  5  6  7  8  9
##   daffodil     7  1  1  1  0  1  0  1  1  5
##   snowdrop     3 12  5  1  1  1  3  6  1  0
##   lily valley  1  2 12  3  0  0  5  3  1  1
##   bluebell     0  0  0  8  1  5  0  0  3  0
##   crocus       0  1  0  3 15  2  1  3  0  0
##   iris         0  0  1  0  0  7  0  0  2  0
##   tigerlily    2  3  1  1  2  0  9  3  2  0
##   tulip        5  1  0  0  0  1  1  1  2  3
##   fritillary   0  0  0  1  1  2  0  0  8  0
##   sunflower    2  0  0  2  0  1  1  3  0 11

The next step would be to figure out what features would help distinguish the “snowdrop”, “daffodil”, and “bluebell” flowers from the others as false positives and negatives from these groups are causing a large portion of the remaining errors.